{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## Deep Learning - Flower Image Classification"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml import Transformer, Estimator, Pipeline\n",
    "from pyspark.ml.classification import LogisticRegression\n",
    "from synapse.ml.downloader import ModelDownloader\n",
    "import sys, time\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Bootstrap Spark Session\n",
    "spark = SparkSession.builder.getOrCreate()\n",
    "\n",
    "from synapse.ml.core.platform import running_on_synapse, running_on_databricks\n",
    "\n",
    "if running_on_synapse():\n",
    "    from notebookutils.visualization import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "mml-deploy": "local"
   },
   "outputs": [],
   "source": [
    "# Choose the model directory for the current platform: Synapse storage,\n",
    "# DBFS on Databricks, or a local folder otherwise.\n",
    "modelDir = (\n",
    "    \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\"\n",
    "    if running_on_synapse()\n",
    "    else \"dbfs:/models/\"\n",
    "    if running_on_databricks()\n",
    "    else \"/tmp/models/\"\n",
    ")\n",
    "\n",
    "# Fetch the ResNet50 model by name via a downloader configured with modelDir.\n",
    "model = ModelDownloader(spark, modelDir).downloadByName(\"ResNet50\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the images.\n",
    "# Use flowers_and_labels.parquet on a larger cluster to get better results.\n",
    "imagesWithLabels = (\n",
    "    spark.read.parquet(\n",
    "        \"wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet\"\n",
    "    )\n",
    "    .withColumnRenamed(\"bytes\", \"image\")\n",
    "    # Seed the ~10% sample so the same rows are drawn on every run.\n",
    "    .sample(fraction=0.1, seed=42)\n",
    ")\n",
    "\n",
    "imagesWithLabels.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Smiley face](https://i.imgur.com/p2KgdYL.jpg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from synapse.ml.opencv import ImageTransformer\n",
    "from synapse.ml.image import UnrollImage\n",
    "from synapse.ml.cntk import ImageFeaturizer\n",
    "\n",
    "# Import the one stage this notebook uses instead of a wildcard import, so it\n",
    "# is clear where DropColumns comes from.\n",
    "from synapse.ml.stages import DropColumns\n",
    "\n",
    "# Baseline pipeline: logistic regression on unrolled, resized images.\n",
    "# Resize each image to 60x60 and write it to the 'scaled' column.\n",
    "it = ImageTransformer().setOutputCol(\"scaled\").resize(size=(60, 60))\n",
    "\n",
    "# Unroll the 'scaled' image column into the 'features' column.\n",
    "ur = UnrollImage().setInputCol(\"scaled\").setOutputCol(\"features\")\n",
    "\n",
    "# The image columns are no longer needed once 'features' exists.\n",
    "dc1 = DropColumns().setCols([\"scaled\", \"image\"])\n",
    "\n",
    "lr1 = (\n",
    "    LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n",
    ")\n",
    "\n",
    "# Drop 'features' from the scored output.\n",
    "dc2 = DropColumns().setCols([\"features\"])\n",
    "\n",
    "basicModel = Pipeline(stages=[it, ur, dc1, lr1, dc2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deep pipeline: run the 'image' column through the downloaded model and\n",
    "# write the result to 'features'.\n",
    "# NOTE(review): setCutOutputLayers(1) presumably removes the final layer so the\n",
    "# output is a feature vector - confirm against the ImageFeaturizer docs.\n",
    "resnet = (\n",
    "    ImageFeaturizer()\n",
    "    .setInputCol(\"image\")\n",
    "    .setOutputCol(\"features\")\n",
    "    .setModelLocation(model.uri)\n",
    "    .setLayerNames(model.layerNames)\n",
    "    .setCutOutputLayers(1)\n",
    ")\n",
    "\n",
    "# The raw image column is not needed once 'features' exists.\n",
    "dc3 = DropColumns().setCols([\"image\"])\n",
    "\n",
    "# Same classifier settings as the baseline: 8 iterations on 'features' / 'labels'.\n",
    "lr2 = LogisticRegression(maxIter=8, featuresCol=\"features\", labelCol=\"labels\")\n",
    "\n",
    "# Drop 'features' from the scored output.\n",
    "dc4 = DropColumns().setCols([\"features\"])\n",
    "\n",
    "deepModel = Pipeline(\n",
    "    stages=[\n",
    "        resnet,\n",
    "        dc3,\n",
    "        lr2,\n",
    "        dc4,\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Resnet 18](https://i.imgur.com/Mb4Dyou.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How does it work?\n\n![Convolutional network weights](https://i.stack.imgur.com/Hl2H6.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run the experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def timedExperiment(model, train, test):\n",
    "    \"\"\"Fit ``model`` on ``train``, score ``test``, and print the elapsed time.\n",
    "\n",
    "    Args:\n",
    "        model: Object exposing ``fit(train)``; the fitted result must expose\n",
    "            ``transform(test)``.\n",
    "        train: Dataset passed to ``model.fit``.\n",
    "        test: Dataset passed to the fitted model's ``transform``.\n",
    "\n",
    "    Returns:\n",
    "        The transformed test set converted with ``toPandas()``.\n",
    "    \"\"\"\n",
    "    # perf_counter is monotonic, so the measured interval cannot be skewed by\n",
    "    # system clock adjustments the way time.time() can.\n",
    "    start = time.perf_counter()\n",
    "    result = model.fit(train).transform(test).toPandas()\n",
    "    print(\"Experiment took {}s\".format(time.perf_counter() - start))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed the 80/20 split so it is deterministic for a given input DataFrame.\n",
    "train, test = imagesWithLabels.randomSplit([0.8, 0.2], seed=42)\n",
    "train.count(), test.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time the baseline pipeline (resized, unrolled images + logistic regression).\n",
    "basicResults = timedExperiment(model=basicModel, train=train, test=test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time the deep pipeline (ImageFeaturizer features + logistic regression).\n",
    "deepResults = timedExperiment(model=deepModel, train=train, test=test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot confusion matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def evaluate(results, name):\n",
    "    \"\"\"Plot the row-normalized confusion matrix of one experiment.\n",
    "\n",
    "    Draws on the current matplotlib axes and annotates the plot with the\n",
    "    overall accuracy.\n",
    "\n",
    "    Args:\n",
    "        results: Table with a ``labels`` column (true classes) and a\n",
    "            ``prediction`` column (predicted classes), e.g. the value\n",
    "            returned by ``timedExperiment``.\n",
    "        name: Experiment name inserted into the plot title.\n",
    "    \"\"\"\n",
    "    y_true = np.asarray([int(label) for label in results[\"labels\"]])\n",
    "    y_pred = np.asarray(results[\"prediction\"])\n",
    "\n",
    "    accuracy = np.mean(y_pred == y_true)\n",
    "    cm = confusion_matrix(y_true, y_pred)\n",
    "    # Normalize each row by its number of true samples. A class that only shows\n",
    "    # up in the predictions has an all-zero row; leave it at 0 instead of\n",
    "    # dividing by zero and filling the row with NaN.\n",
    "    row_totals = cm.sum(axis=1, keepdims=True)\n",
    "    cm = np.divide(\n",
    "        cm.astype(\"float\"),\n",
    "        row_totals,\n",
    "        out=np.zeros(cm.shape, dtype=float),\n",
    "        where=row_totals != 0,\n",
    "    )\n",
    "\n",
    "    # Raw string keeps the literal backslash for mathtext without relying on an\n",
    "    # invalid Python escape sequence.\n",
    "    plt.text(\n",
    "        40, 10, r\"$Accuracy$ $=$ ${}\\%$\".format(round(accuracy * 100, 1)), fontsize=14\n",
    "    )\n",
    "    plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n",
    "    plt.colorbar()\n",
    "    plt.xlabel(\"$Predicted$ $label$\", fontsize=18)\n",
    "    plt.ylabel(\"$True$ $Label$\", fontsize=18)\n",
    "    plt.title(\"$Normalized$ $CM$ $for$ ${}$\".format(name))\n",
    "\n",
    "\n",
    "# Draw both normalized confusion matrices side by side: deep model on the\n",
    "# left, baseline on the right.\n",
    "plt.figure(figsize=(12, 5))\n",
    "panels = [(deepResults, \"CNTKModel + LR\"), (basicResults, \"LR\")]\n",
    "for position, (panelResults, panelName) in enumerate(panels, start=1):\n",
    "    plt.subplot(1, 2, position)\n",
    "    evaluate(panelResults, panelName)\n",
    "# Note that on the larger dataset the accuracy will bump up from 44% to >90%\n",
    "display(plt.show())"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
